# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy import Request
from lxml import etree
from scrapymodel.items import WeatherItem


class WeatherSpider(scrapy.Spider):
    """Crawl historical weather data from tianqihoubao.com.

    Crawl chain:
      /lishi/                              (province/city index)   -> parse
      /lishi/<city>.html                   (per-city month index)  -> parse_citylist
      /lishi/<city>/month/<yyyymm>.html    (daily weather table)   -> parse_weather
    """

    name = 'weather'
    # allowed_domains = ['tianqihoubao']
    start_urls = ['http://www.tianqihoubao.com/lishi/']

    def parse(self, response):
        """Parse the city index page and yield one Request per city.

        Extracts links of the form
        http://www.tianqihoubao.com/lishi/beijing.html
        """
        soup = BeautifulSoup(response.text, 'lxml')
        for city_block in soup.find_all(name='div', class_='citychk'):
            for city in city_block.find_all(name='dd'):
                url = 'http://www.tianqihoubao.com' + city.a['href']
                yield Request(url=url, callback=self.parse_citylist)

    def parse_citylist(self, response):
        """Parse a city's month index and yield one Request per month page.

        Extracts links of the form
        http://www.tianqihoubao.com/lishi/tianjin/month/201811.html
        """
        soup = BeautifulSoup(response.text, 'lxml')
        for detail in soup.find_all(name='div', class_='wdetail'):
            for month in detail.find_all(name='li'):
                # Quarter headings ("…季度:") carry no month link — skip them.
                if month.text.endswith("季度:"):
                    continue
                url = 'http://www.tianqihoubao.com' + month.a['href']
                yield Request(url=url, callback=self.parse_weather)

    def parse_weather(self, response):
        """Parse a monthly weather table and yield one WeatherItem per day.

        Each table row holds 4 cells: date, weather, "max/min" temperature,
        wind. The first row is a header and is discarded.
        """
        # City name is the 5th path segment: .../lishi/<city>/month/...
        cityname = response.url.split('/')[4]

        weather_html = etree.HTML(response.text)
        table = weather_html.xpath('//table//tr//td//text()')
        # Collect non-empty cell texts with spaces and CRLF noise removed.
        listall = []
        for t in table:
            cleaned = t.replace(' ', '').replace('\r\n', '').strip()
            if cleaned:
                listall.append(cleaned)

        # Group the flat cell list into per-day rows of 4 cells each.
        n = 4
        sublist = [listall[i:i + n] for i in range(0, len(listall), n)]
        # BUG FIX: was sublist.remove(sublist[0]) — remove() deletes the first
        # *equal* element, not a position, and raises IndexError on an empty
        # page. Slicing drops the header row positionally and is empty-safe.
        # (The old `if sub == sublist[0]: pass` dead branch is gone too.)
        for sub in sublist[1:]:
            # Guard against a malformed trailing chunk or a temperature cell
            # without the expected "max/min" separator (old code raised
            # IndexError on sub2[1] in that case).
            if len(sub) < 4 or '/' not in sub[2]:
                continue
            date, tq, temps, fengli = sub[0], sub[1], sub[2], sub[3]
            # Split "max/min" into the two temperature fields.
            maxtemp, mintemp = temps.split('/', 1)

            Weather = WeatherItem()
            Weather['cityname'] = cityname
            # Normalize dates like "2018年11月01日" to "20181101".
            Weather['data'] = date.replace('年', '').replace('月', '').replace('日', '')
            Weather['tq'] = tq
            Weather['maxtemp'] = maxtemp
            Weather['mintemp'] = mintemp
            Weather['fengli'] = fengli
            yield Weather